In [1]:
import urllib2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn import metrics
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from bs4 import BeautifulSoup

%matplotlib inline

In [2]:
def get_tag_name(tag_url):
    """Takes a Atlas Obscura db dump and turns the tag_url into bare tags."""
    return tag_url.split('/')[2]

def get_dummies_and_join(df,var_column):
    """gets dummies for a var_column in a dataframe and returns a dataframe with
    those dummies joined on."""
    df = df.join(pd.get_dummies(df[var_column]))
    return df

def unique_article_set(df,column):
    """Takes a DataFrame and a column to groupby on and returns the DF, using
    sum as the aggfunc for the remaining columns. Typically this will be a
    list of articles containing duplicates with an array of dummy vars
    attached to it. Column is a string, df is a pandas DataFrame"""
    return df.groupby(column).sum()

##Skipped over the Time Series section. Should come later.

def append_atlas_url(prefix,column):
    """Adds a given prefix to items in a Series and returns a list with the
    prefix added to each item. Typically should be set equal to the input
    Series so as to assign the new list back to the DataFrame."""
    return [prefix + x for x in column]

def get_simplereach_url(x):
    """Strips the 'http://' prefix from a SimpleReach URL; returns None for
    non-string values (e.g. NaN)."""
    if type(x) == str:
        return x.split('http://')[1]
    else:
        return None

def target_pageview_cutoff(pageviews,pv_column):
    """Creates a target vector of categorical values (1,0) based on whether a
    particular article is in excess of the PV cutoff."""
    return [1 if x > pageviews else 0 for x in pv_column]

def get_year(df,date_column):
    """Returns a Series of just the year based on a date column passed into the
    function."""
    return pd.DatetimeIndex(df[date_column]).year

def get_total_tagged(df,new_col_name):
    """Takes the articles and tags dataframe, sums the tag vectors, and Creates
    a new dataframe containing the tags and the number of articles tagged."""
    total_tagged = pd.DataFrame(data=df.sum(),columns=[new_col_name])
    total_tagged = total_tagged.sort_values('num_tagged',ascending=False).drop(
                                                        'pageviews',
                                                        axis=0,
                                                        )
    return total_tagged

def get_interaction_features(df,interaction_tags,total_tagged,tag_threshold):
    """Takes the articles and tags DataFrame and creates binary interaction
    features using a list passed as interaction_tags as the tags to be paired
    with the rest of the set. DataFrame must only contain index and
    categorical features. DataFrame will return interaction features only for
    tags having article counts above the threshold set as tag_threshold."""
    interactions = pd.DataFrame(index=df.index)
    kept_tags = df.drop(total_tagged[total_tagged.num_tagged < tag_threshold].index,axis=1)
    for item in interaction_tags:
        for column in kept_tags.columns:
            interactions[(item + '_' + column)] = df[item] + df[column]
    return interactions

def correct_values(interactions_df):
    """Changes any values of 2 to 1 and 1s to 0s. This is because the dataframe
    of interactions is created in get_interaction_features() by adding two columns
    together to generate a composite column that represents an interaction
    feature. Where there's a 2, that article has both tags. This should be 1.
    Where there's a 1, the article only has 1 of the tags, and is falsey for that
    tag."""
    def two_to_one(x):
        if x == 2.0:
            return 1
        else:
            return 0
    for x in interactions_df.columns:
        interactions_df[x] = interactions_df[x].apply(two_to_one)
    return interactions_df

def drop_identity_tags(interactions_df):
    """Takes the interaction dataframe and drops the tags where the interaction_tags
    paired to one another."""
    tagged_total = get_total_tagged(interactions_df,'num_tagged')
    interactions_df = interactions_df.drop(tagged_total[0:26].index,axis=1)
    return interaction_df

def drop_zero_cols(df):
    """Takes a dataframe and drops any column that sums to zero. This is used to
    drop categorical variables that not a single datapoint are positive for. cols
    should only be categorical variables encoded (0,1)."""
    df = df.fillna(0)
    for item in df.columns:
        if df[item].sum() == 0:
            df = df.drop(item,axis=1)
        else:
            continue
    return df

def drop_tag_features(df,tag_threshold):
    """Takes a dataframe of articles and tag features and returns a subset of df
    that contains only the tags that have corresponding article counts above the
    tag_threshold."""
    total_tagged = get_total_tagged(df,'num_tagged')
    df2 = df.drop(total_tagged[total_tagged.num_tagged < tag_threshold].index,axis=1)
    return df2

def get_cross_validation_score(X,y,estimator,n_folds):
    """Dependent on sklearn cross_validation and numpy. Takes an X and y and returns the cross_validation accuracy score for a given linear_model.estimator."""
    kf = cross_validation.KFold(len(X),n_folds=n_folds)
    scores = []
    for train_index, test_index in kf:
        lr = estimator.fit(X.iloc[train_index],y.iloc[train_index])
        scores.append(lr.score(X.iloc[test_index],y.iloc[test_index]))
    return np.mean(scores)

def get_probabilities(model,X):
    """Takes an estimator model that has been fix and the X used in that model
    and returns the probabilities from predict_proba with the original feature
    names zipped back up with them."""
    interactions_model_scores = model.predict_proba(X)[:,1]
    interactions_probabilities = pd.DataFrame(zip(X.columns,interactions_model_scores),columns=['tags','probabilities'])
    interactions_probabilities = interactions_probabilities.sort_values('probabilities',ascending=False)
    return interactions_probabilities

def get_sub_tags(df):
    """Takes a probabilities DataFrame whose 'tags' column holds interaction
    names of the form 'tag_subtag' and adds the subtag as a new column."""
    df = df.reset_index()
    df['subtag'] = df.tags.apply(lambda x: x.split('_')[1])
    return df

def get_pageviews(probabilities,df):
    """Sums the pageviews of all articles corresponding to a tag and returns the probabilities dataframe with a new column of total PVs for each tag."""
    probabilities['pageview'] = [sum(df['pageviews'][df[item] == 1]) for item in probabilities['tags']]
    return probabilities

def get_roc_scores(y,scores):
    return metrics.roc_auc_score(y,scores)

def get_article_tags(url):
    """Uses BeautifulSoup to scrape Atlas pages for our tags"""
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, 'lxml')
    if "/articles/" in url:
        taglist = [elem.get_text() for elem in soup.select('div.item-tags a')]
    elif "/places" in url:
        taglist = [elem.get_text() for elem in soup.select('div.item-tags a')]
    else:
        taglist = ""
    for count,item in enumerate(taglist):
        taglist[count] = item.split('#')[1]
    return url, taglist

def transform_article_for_prediction(url,whole_set):
    url, taglist = get_article_tags(url)
    tags = {}
    for item in taglist:
        tags[item] = [1]
    article_tag_vectors = pd.DataFrame(tags)
    article_tag_vectors = article_tag_vectors.reindex(columns=whole_set.columns,
                                                      fill_value=0)
    return article_tag_vectors
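
The interaction-feature helpers above (get_interaction_features, correct_values, drop_identity_tags, drop_zero_cols) are defined but not called later in this notebook. A minimal sketch of how they are meant to chain together, assuming a hypothetical tag_df that holds only the article index and 0/1 tag columns, a hypothetical choice of interaction_tags, and an arbitrary tag_threshold of 20:

total_tagged = get_total_tagged(tag_df, 'num_tagged')
interaction_tags = ['100-wonders', 'abandoned']   # hypothetical tags to pair against the rest
interactions = get_interaction_features(tag_df, interaction_tags, total_tagged, tag_threshold=20)
interactions = correct_values(interactions)       # 2 -> 1 (article has both tags), 1 -> 0 otherwise
interactions = drop_zero_cols(interactions)       # discard pairings no article actually has

drop_identity_tags can then be used to strip the columns where the interaction tags are paired with themselves.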

First we import the table of tag-article mappings from our SQL database (but read in as a .csv).

In [3]:
df = pd.read_csv('atlas-taggings.csv')

In [4]:
df[2:5]


Out[4]:
tag_id tag_url tagged_type tagged_id tagged_url
2 2 www.atlasobscura.com/categories/panoramas Place 6431 www.atlasobscura.com/places/gettysburg-cyclorama
3 2 www.atlasobscura.com/categories/panoramas Article 2311 www.atlasobscura.com/articles/rip-gettysburg-c...
4 258 www.atlasobscura.com/categories/bridges Place 10134 www.atlasobscura.com/places/gimbel-s-bridge

We only care about the content type "Article"


In [5]:
articles = df[df.tagged_type == 'Article']

Next we need to extract the bare tag name from the tag URL string.


In [6]:
articles.tag_url = articles.tag_url.apply(get_tag_name)


/Users/Mike/anaconda/lib/python2.7/site-packages/pandas/core/generic.py:2698: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
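
The warning is harmless here, but it can be avoided by taking an explicit copy of the slice (or by assigning with .loc, as the warning suggests). A minimal sketch of the same two steps:

articles = df[df.tagged_type == 'Article'].copy()
articles.loc[:, 'tag_url'] = articles.tag_url.apply(get_tag_name)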

In [7]:
articles = get_dummies_and_join(articles,'tag_url')
articles = articles.drop(['tag_id','tag_url','tagged_type','tagged_id'],axis=1)

In [8]:
articles = unique_article_set(articles,'tagged_url')

In [9]:
articles = articles.reset_index().set_index('tagged_url')

Import the table of URLs and total pageviews


In [10]:
pageviews = pd.read_csv('output_articles_performance.csv',header=None,names=[
        'url','published','pageviews'
    ])

In [11]:
pageviews.url = ['www.atlasobscura.com/articles/' + x for x in pageviews.url]

In [12]:
pageviews.describe()


Out[12]:
pageviews
count 3446.000000
mean 7052.891759
std 23256.215270
min 1.000000
25% 1150.250000
50% 2571.500000
75% 5834.750000
max 621494.000000
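
Pageviews are heavily right-skewed (median ~2,600 vs. mean ~7,000), which is one reason to work with a binary pageview cutoff below rather than the raw counts. A log-scale view makes the shape easier to see; a minimal sketch, assuming np.log over the pageview column:

np.log(pageviews.pageviews.dropna()).plot(kind='density', title='Log Page View Distribution, All Articles')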

In [13]:
pageviews.set_index('url',inplace=True)
article_set = articles.join(pageviews)

In [14]:
article_set['ten_thousand'] = target_pageview_cutoff(10000,article_set.pageviews)

In [15]:
article_set['published'] = pd.to_datetime(article_set['published'])
article_set['year'] = get_year(article_set,'published')

In [54]:
article_set.pageviews.plot(kind='density',title='Page View Distribution, All Articles')

In [17]:
ax = article_set.boxplot(column='pageviews',by='year',figsize=(6,6),showfliers=False)
ax.set(title='PV distribution by year of publication, no outliers',ylabel='pageviews')


Out[17]:
[<matplotlib.text.Text at 0x119e4ac50>, <matplotlib.text.Text at 0x1062cc950>]

In [18]:
sns.factorplot(
    x='year',
    y='ten_thousand',
    data = article_set
    )


Out[18]:
<seaborn.axisgrid.FacetGrid at 0x11aab0e90>

In [19]:
total_tagged = get_total_tagged(article_set,'num_tagged')

In [20]:
article_set.fillna(value=0,inplace=True)

In [21]:
y = article_set.ten_thousand
X = article_set.drop(['pageviews','published','ten_thousand'],axis=1)

In [22]:
cross_val_score = get_cross_validation_score(X,y,linear_model.LogisticRegression(penalty = 'l1'),
                                            n_folds=5)

In [23]:
lr = linear_model.LogisticRegression(penalty = 'l1').fit(X,y)

In [24]:
lr_scores = lr.predict_proba(X)[:,1]
roc_score = get_roc_scores(y,lr_scores)
print roc_score


0.797464130142

In [25]:
single_tag_probabilities = get_probabilities(lr,X)

Now we will explore how a KNN classifier does with our dataset


In [26]:
params = {'n_neighbors' : [x for x in range(2,100,4)],
          'weights' : ['distance','uniform']}
gs = GridSearchCV(estimator = KNeighborsClassifier(),param_grid=params,
                  n_jobs=-1,cv=10,verbose=1)
gs.fit(X,y)
print gs.best_params_
print gs.best_score_

knn = gs.best_estimator_.fit(X,y)
knn_probs = get_probabilities(knn,X)


Fitting 10 folds for each of 50 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.2min finished
{'n_neighbors': 30, 'weights': 'uniform'}
0.848560255954

In [27]:
knn_cross_val_score = get_cross_validation_score(X,y,knn,5)
knn_scores = knn.predict_proba(X)[:,1]
knn_roc_score = get_roc_scores(y,knn_scores)

Now we will explore how a RandomForest classifier does with our dataset


In [55]:
params_rfc = {'max_depth': np.arange(20,100,5),
             'min_samples_leaf': np.arange(90,200,5),
             'n_estimators': [20],
             'criterion' : ['gini','entropy']
                }
gs1 = GridSearchCV(RandomForestClassifier(),param_grid=params_rfc, cv=10, scoring='roc_auc',n_jobs=-1,verbose=1)
gs1.fit(X,y)
print gs1.best_params_
print gs1.best_score_


Fitting 10 folds for each of 704 candidates, totalling 7040 fits
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:   50.8s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 7040 out of 7040 | elapsed:  3.1min finished
{'n_estimators': 20, 'criterion': 'gini', 'max_depth': 85, 'min_samples_leaf': 100}
0.631050275454


In [29]:
rf = gs1.best_estimator_
rf.fit(X,y)

rf_cross_val_score = get_cross_validation_score(X,y,rf,5)
rf_scores = rf.predict_proba(X)[:,1]
rf_roc_score = get_roc_scores(y,rf_scores)

In [30]:
print "Logistic Regression Cross-validation Score: ", cross_val_score
print "K Nearest Neighbors Cross-validation Score: ", knn_cross_val_score
print "RandomForest Cross-validation Score: ", rf_cross_val_score

print "Logistic Regressions ROC AUC score: ", roc_score
print "K Nearest Neighbors ROC AUC score: ", knn_roc_score
print "RandomForest ROC AUC score: ", rf_roc_score


Logistic Regression Cross-validation Score:  0.8499573333
K Nearest Neighbors Cross-validation Score:  0.848535109954
RandomForest Cross-validation Score:  0.848179238068
Logistic Regressions ROC AUC score:  0.797464130142
K Nearest Neighbors ROC AUC score:  0.714784820116
RandomForest ROC AUC score:  0.598521135193
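
The three cross-validation accuracies are nearly identical while the ROC AUC scores differ substantially, so the ROC curves themselves are worth a look. A minimal sketch using metrics.roc_curve on the in-sample scores computed above:

for name, scores in [('Logistic Regression', lr_scores),
                     ('K Nearest Neighbors', knn_scores),
                     ('RandomForest', rf_scores)]:
    fpr, tpr, _ = metrics.roc_curve(y, scores)
    plt.plot(fpr, tpr, label=name)
plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')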

Prediction of a given URL


In [31]:
url, taglist = get_article_tags('http://www.atlasobscura.com/articles/the-ao-exit-interview-12-years-in-the-blue-man-group')

In [32]:
transformed_article = transform_article_for_prediction(url,article_set)
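
Because transform_article_for_prediction was given article_set as the whole_set, the returned vector still carries the non-tag columns (pageviews, published, and so on). Aligning it to the training features and scoring it with the logistic regression fit above looks roughly like this (a minimal sketch):

article_X = transformed_article.reindex(columns=X.columns, fill_value=0)
print lr.predict(article_X)
print lr.predict_proba(article_X)[:,1]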

Refining the model

We will reproduce our results looking only at articles published in 2016


In [33]:
article_set.head(1)


Out[33]:
100-wonders 19th-century 2016-election 30-rock 31-days-of-halloween abandoned abandoned-amusement-parks abandoned-brooklyn abandoned-cemetaries abandoned-hospitals ... wwi wwii yehlui-geological-park yeti zombies zoos published pageviews ten_thousand year
tagged_url
www.atlasobscura.com/articles/10-little-known-beaches-to-explore-in-the-last-days-of-summer 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 2015-08-01 651.0 0 2015.0

1 rows × 987 columns


In [34]:
y1 = article_set[article_set.year >= 2016].ten_thousand
X1 = article_set[article_set.year >= 2016].drop(['pageviews','published','ten_thousand'],axis=1)

In [35]:
cross_val_score1 = get_cross_validation_score(X1,y1,linear_model.LogisticRegression(penalty = 'l1'),
                                            n_folds=5)
lr1 = linear_model.LogisticRegression(penalty = 'l1').fit(X1,y1)
lr_scores1 = lr1.predict_proba(X1)[:,1]
roc_score1 = get_roc_scores(y1,lr_scores1)
print roc_score1


0.82455608751

Now we will rebuild our model to have it predict if an article will receive over 500 Facebook shares.


In [36]:
simplereach = pd.read_csv('~/Downloads/all-content-simplereach.csv')

In [37]:
simplereach.Url = simplereach.Url.apply(get_simplereach_url)
simplereach = simplereach.set_index('Url')
simplereach = simplereach[['Avg Engaged Time','Social Actions','Facebook Shares','FaceBook Referrals']]

In [38]:
article_set2 = article_set.join(simplereach['Facebook Shares'])

In [39]:
article_set2['five_hundred_shares'] = target_pageview_cutoff(500,article_set2['Facebook Shares'])

Logistic Regression with FB Shares > 500 as target


In [48]:
y2 = article_set2.five_hundred_shares
X2 = article_set2.drop(['pageviews',
                        'published',
                        'ten_thousand',
                        'Facebook Shares',
                        'five_hundred_shares'
                           ],axis=1)

cross_val_score_social = get_cross_validation_score(X2,y2,linear_model.LogisticRegression(penalty = 'l1'),
                                            n_folds=5)
lr_social = linear_model.LogisticRegression(penalty = 'l1').fit(X2,y2)
lr_scores_social = lr_social.predict_proba(X2)[:,1]
roc_score_social = get_roc_scores(y2,lr_scores_social)
print "Cross-val score when predicting Facebook shares > 500: ", cross_val_score_social
print "ROC AUC score when predicting Facebook shares > 500: ",roc_score_social


Cross-val score when predicting Facebook shares > 500:  0.907193289634
ROC AUC score when predicting Facebook shares > 500:  0.836639925702

In [41]:
url = 'http://www.atlasobscura.com/articles/winters-effigies-the-deviant-history-of-the-snowman'
lr_social.predict(transform_article_for_prediction(url,X2))


Out[41]:
array([0])

KNN with FB Shares > 500 as target


In [50]:
params_social = {'n_neighbors' : [x for x in range(2,100,4)],
          'weights' : ['distance','uniform']}
gs_social = GridSearchCV(estimator = KNeighborsClassifier(),param_grid=params_social,
                  n_jobs=-1,cv=10,verbose=1)
gs_social.fit(X2,y2)
print gs_social.best_params_
print gs_social.best_score_

knn_social = gs_social.best_estimator_.fit(X2,y2)
knn_probs_social = get_probabilities(knn_social,X2)
knn_cross_val_score_social = get_cross_validation_score(X2,y2,knn_social,5)
knn_scores_social = knn_social.predict_proba(X2)[:,1]
knn_roc_score_social = get_roc_scores(y2,knn_scores_social)


Fitting 10 folds for each of 50 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   42.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.0min finished
{'n_neighbors': 14, 'weights': 'uniform'}
0.91361535727

RandomForest with FB Shares > 500 as target


In [43]:
params_rfc = {'max_depth': np.arange(20,100,5),
             'min_samples_leaf': np.arange(90,200,5),
             'n_estimators': [20]}
gs1_social = GridSearchCV(RandomForestClassifier(),param_grid=params_rfc, cv=10, scoring='roc_auc',n_jobs=-1,verbose=1)
gs1_social.fit(X2,y2)

rf_social = gs1_social.best_estimator_
rf_social.fit(X2,y2)
rf_cross_val_score_social = get_cross_validation_score(X2,y2,rf_social,5)
rf_scores_social = rf_social.predict_proba(X2)[:,1]
rf_roc_score_social = get_roc_scores(y2,rf_scores_social)


Fitting 10 folds for each of 352 candidates, totalling 3520 fits
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 3520 out of 3520 | elapsed:  1.5min finished

In [52]:
print gs1_social.best_params_
print gs1_social.best_score_


{'n_estimators': 20, 'max_depth': 25, 'min_samples_leaf': 115}
0.6579705052

In [53]:
print "Logistic Regression Cross-validation Score: ", cross_val_score_social
print "K Nearest Neighbors Cross-validation Score: ", knn_cross_val_score_social
print "RandomForest Cross-validation Score: ", rf_cross_val_score_social

print "Logistic Regressions ROC AUC score: ", roc_score_social
print "K Nearest Neighbors ROC AUC score: ", knn_roc_score_social
print "RandomForest ROC AUC score: ", rf_roc_score_social


Logistic Regression Cross-validation Score:  0.907193289634
K Nearest Neighbors Cross-validation Score:  0.913595823088
RandomForest Cross-validation Score:  0.913595823088
Logistic Regressions ROC AUC score:  0.836639925702
K Nearest Neighbors ROC AUC score:  0.786659140766
RandomForest ROC AUC score:  0.562823653745

In [45]:
np.mean(y)


Out[45]:
0.15179523640241735

About 15% of articles cross the 10,000-pageview threshold, so the ~0.85 cross-validation accuracies above are close to what always predicting the majority class would give; the ROC AUC scores are the more informative comparison.

In [46]:
simplereach.describe()


Out[46]:
Avg Engaged Time Social Actions Facebook Shares FaceBook Referrals
count 14898.000000 14898.000000 14898.000000 14898.000000
mean 88.869915 391.969929 61.045778 893.643845
std 57.559737 2632.323342 444.649227 7870.629805
min 5.000000 -19.000000 -1.000000 0.000000
25% 62.380740 1.000000 0.000000 0.000000
50% 77.141359 11.000000 2.000000 5.000000
75% 97.413157 119.000000 14.000000 105.000000
max 1360.000000 201271.000000 37480.000000 453648.000000
